2/16/2021 can use smartseq make a gene tissue gmt
library(tidyverse)
# library(readr)
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
library(pheatmap)
library(caret)
library(ReactomePA)
library(annotate)
library(seqinr)
save_path = '/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/novogene_071420/scrna_enrichment/'
path_genes = '/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/novogene_071420/D_mpraanalyze_barcode_allelic//egene_gtex/'
diseases = sort(c('schizo','depress_updatedhoward','bipolar','anxiety','attent','personality','panic','traum','autism','ocd'))
diseases_all = c(diseases, 'all')
get_genes = function(dz){
file = str_c(path_genes,dz,'.txt')
genes = read.table(file,header=F,stringsAsFactor=F)$V1
print(dz)
print(length(genes))
return(genes)
}
get_entrez = function(genes){
entrez_ids = bitr(genes, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")$ENTREZID
# print(length(entrez_ids))
return(entrez_ids)
# return(bitr(genes, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db"))
}
enrichr_res = function(entrez_genes, c_gmt){
enrichr_res <- enricher(entrez_genes, TERM2GENE=c_gmt,
minGSSize=0,
maxGSSize=1000,
pAdjustMethod = "fdr",
qvalueCutoff = 1,
pvalueCutoff = 1)
if (!is.null(enrichr_res)){
enrichr_res <- setReadable(enrichr_res, OrgDb=org.Hs.eg.db, keyType="ENTREZID")
return(data.frame(enrichr_res))}
return(data.frame())
}
filt_res = function(df){
return(filter(df,p.adjust<0.05))
}
get_lengths = function(list_dfs){
for (name in names(list_dfs)){
print(name)
if (is.data.frame(list_dfs[[name]])){
print(dim(list_dfs[[name]])[1])
}
else{
print(length(list_dfs[[name]]))
}
}
}
save_dfs = function(list_dfs, save_prefix){
for (name in names(list_dfs)){
if (dim(list_dfs[[name]])[1]>0){
save_filepath = paste0(save_prefix, '_', name,'.csv')
write.csv(list_dfs[[name]], save_filepath)
}
}
}
# get list of unique genes in list of lists
get_genes_unique = function(list_dfs){
genelist = sort(unique(do.call('c',lapply(do.call('rbind',list_dfs)$geneID, function(x) strsplit(x,'/')[[1]]))))
return(genelist)
}
run_enrichment = function(entrez_list, c_gmt, save_prefix){
enrich_df_list = sapply(entrez_list, function(x) enrichr_res(x,c_gmt))
save_dfs(enrich_df_list, save_prefix)
enrich_df_list_filt = sapply(enrich_df_list, filt_res)
get_lengths(enrich_df_list_filt)
print('getting unique genes --all')
print(get_genes_unique (enrich_df_list) )
print('getting unique genes --pval filt')
print(get_genes_unique (enrich_df_list_filt) )
return(enrich_df_list_filt)
}
# rna_df is samples x genes and removed low variances genes
remove_lowvar_genes = function(rna_df){
nzv_cols <- nearZeroVar(rna_df)
print(dim(rna_df))
if(length(nzv_cols) > 0) rna_df <- rna_df[, -nzv_cols]
print(dim(rna_df))
return(rna_df)
}
save_pheatmap_png <- function(x, filename, width=1200, height=1000, res = 200) {
png(filename, width = width, height = height, res = res)
grid::grid.newpage()
grid::grid.draw(x$gtable)
dev.off()
}
save_pheatmap_pdf <- function(x, filename, width=7, height=7) {
stopifnot(!missing(x))
stopifnot(!missing(filename))
pdf(filename, width=width, height=height)
grid::grid.newpage()
grid::grid.draw(x$gtable)
dev.off()
}
# get_expr
file = '/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/novogene_071420/D_mpraanalyze_barcode_allelic/interesting_egenes.csv'
interesting_egenes = read.csv(file,header=F,stringsAsFactor=F)$V1
# interesting_egenes
file = '/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/novogene_071420/D_mpraanalyze_barcode_allelic/interesting_egenes.csv'
interesting_egenes = read.csv(file,header=F,stringsAsFactor=F)$V1
# interesting_egenes
dz_genes = sapply(diseases_all, get_genes)
dz_gene_list_entrez = sapply(dz_genes,get_entrez)
get_lengths(dz_gene_list_entrez)
# get entrez
length(dz_genes$all)
all_sym_to_entrez = bitr(dz_genes$all, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")#$ENTREZID
str(all_sym_to_entrez)
metadata_df = read.csv('../data/external/brainmap_cortex_smartseq/metadata.csv')
head(metadata_df)
## efforts to try to annotate with cortical layer and region label, might need another heatmap?? (toDO later)
metadata_df%>%
dplyr::select(cluster_label, class_label,subclass_label,region_label,cortical_layer_label)%>%
mutate(cluster_id = str_replace_all(cluster_label, "[ .-]", "."))%>%
filter(cluster_label!='')%>%
distinct()%>%
group_by(cluster_id, class_label, region_label)%>%#,cortical_layer_label)%>%
tally()
# brainmap_expr_df_region = read.csv('../data/processed/fig1/rna_10xm1/brainmap_expr_df_region.csv',row.names=1)
brainmap_expr_df_region = read.csv('../data/external/brainmap_cortex_smartseq/medians.csv',row.names=1)%>%dplyr::select(-X)
brainmap_expr_df_region = data.frame(t(brainmap_expr_df_region))
dim(brainmap_expr_df_region)
head(brainmap_expr_df_region[,1:5])
# get entrez
length(dz_genes$all)
all_sym_to_entrez = bitr(dz_genes$all, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")#$ENTREZID
str(all_sym_to_entrez)
# egenes expressed in a brain tissue
geneset = dz_genes$all#unique(c(dz_genes$all,interesting_egenes))
region_gene_df = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df)
region_gene_df = region_gene_df[,colSums(region_gene_df)>0]
dim(region_gene_df)
294/416
70% of egenes are expressed in at least 1 brain cortex cell type (SMART seq 2018)
# select egenes from expr df
geneset = intersect(dz_genes$all, interesting_egenes)#unique(c(dz_genes$all,interesting_egenes))
region_gene_df = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df)
# region_gene_df[region_gene_df<=4] = 0
region_gene_df = remove_lowvar_genes(region_gene_df)
region_gene_df_log = log2(t(region_gene_df)+1)
dim(region_gene_df_log)
p = pheatmap(t(region_gene_df_log))
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_all.pdf'),width=11,height=18)
par(bg='white')
plot(p$tree_col)
dev.off()
# select egenes from expr df schizo for
geneset = intersect(dz_genes$schizo, interesting_egenes)#unique(c(dz_genes$schizo,interesting_egenes))
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
# region_gene_df_dz[region_gene_df_dz<=5] = 0
# region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
region_gene_df_dz_log = region_gene_df_dz_log[rowSums(region_gene_df_dz_log)>.1,]
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
dim(region_gene_df_dz_log)
p = pheatmap(region_gene_df_dz_log)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_schizo.pdf'),width=20,height=10)
par(bg='white')
plot(p$tree_col)
dev.off()
# select egenes from expr df schizo for
geneset = intersect(dz_genes$bipolar, interesting_egenes)#unique(c(dz_genes$schizo,interesting_egenes))
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
# region_gene_df_dz[region_gene_df_dz<=5] = 0
# region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
region_gene_df_dz_log = region_gene_df_dz_log[rowSums(region_gene_df_dz_log)>.1,]
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
dim(region_gene_df_dz_log)
p = pheatmap(region_gene_df_dz_log)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_bipolar.pdf'),width=20,height=5)
par(bg='white')
plot(p$tree_col)
dev.off()
# select egenes from expr df schizo for
geneset = intersect(dz_genes$depress_updatedhoward, interesting_egenes)#unique(c(dz_genes$schizo,interesting_egenes))
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
# region_gene_df_dz[region_gene_df_dz<=5] = 0
# region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
region_gene_df_dz_log = region_gene_df_dz_log[rowSums(region_gene_df_dz_log)>.1,]
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
dim(region_gene_df_dz_log)
p = pheatmap(region_gene_df_dz_log)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_depress.pdf'),width=20,height=5)
par(bg='white')
plot(p$tree_col)
dev.off()
# select egenes from expr df schizo for
bpd_geneset = intersect(dz_genes$bipolar, interesting_egenes)#unique(c(dz_genes$schizo,interesting_egenes))
mdd_geneset = intersect(dz_genes$depress_updatedhoward, interesting_egenes)#unique(c(dz_genes$schizo,interesting_egenes))
geneset = union(bpd_geneset, mdd_geneset)
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
# region_gene_df_dz[region_gene_df_dz<=5] = 0
# region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
region_gene_df_dz_log = region_gene_df_dz_log[rowSums(region_gene_df_dz_log)>.1,]
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
dim(region_gene_df_dz_log)
p = pheatmap(region_gene_df_dz_log)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_bipolardepress.pdf'),width=20,height=5)
par(bg='white')
plot(p$tree_col)
dev.off()
# select egenes from expr df interesting for
geneset = interesting_egenes
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
region_gene_df_dz[region_gene_df_dz<=1] = 0
region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
p = pheatmap(region_gene_df_dz_log)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_vignette.pdf'),width=20,height=15)
par(bg='white')
plot(p$tree_col)
dev.off()
# select egenes from expr df schizo for
bpd_geneset = intersect(dz_genes$bipolar, interesting_egenes)#unique(c(dz_genes$schizo,interesting_egenes))
scz_geneset = intersect(dz_genes$schizo, interesting_egenes)#unique(c(dz_genes$schizo,interesting_egenes))
geneset = union(bpd_geneset, scz_geneset)
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
# region_gene_df_dz[region_gene_df_dz<=5] = 0
# region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
region_gene_df_dz_log = region_gene_df_dz_log[rowSums(region_gene_df_dz_log)>.1,]
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
dim(region_gene_df_dz_log)
dz_annon = rbind(data.frame(gene = bpd_geneset, disease='BPD'),
data.frame(gene = scz_geneset, disease='SCZ'))
dz_annon = dz_annon = dz_annon%>%
group_by(gene)%>%
mutate(disease = as.character(paste0(disease, collapse = "|")))%>%
distinct()%>%
ungroup()%>%
mutate(gene = as.character(gene))%>%
filter(gene %in% rownames(region_gene_df_dz_log))%>%
column_to_rownames('gene')
# ungroup()%>%
dim(dz_annon)
cell_annon= metadata_df%>%
dplyr::select(cluster_label, class_label)%>%
mutate(cluster_label = str_replace_all(cluster_label, "[ .-]", "."))%>%
distinct()%>%
filter(class_label!='')%>%
filter(cluster_label %in% colnames(region_gene_df_dz_log))%>%
column_to_rownames('cluster_label')
dim(cell_annon)
p = pheatmap(region_gene_df_dz_log,
annotation_row=dz_annon,
annotation_col = cell_annon)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_bipolarschizo.pdf'),width=20,height=12)
par(bg='white')
plot(p$tree_col)
dev.off()
library(ggfortify)
library(cluster)
pca_res <- prcomp(region_gene_df_dz_log)
pca_res_df <- as.data.frame(pca_res$x)
pca_res_df = cbind(dz_annon,pca_res_df)
pca_res_df$gene = rownames(pca_res_df)
ggplot(pca_res_df, aes(x=PC1, y=PC2, label=gene,color=disease))+
geom_point()+
geom_text(position=position_jitter(width=1,height=1))+
theme_classic()+ggtitle('PCA BrainMap Cortical Neuron Type')
ggsave(paste0(save_path, 'pca_bipolarschizo.pdf'))
autoplot(pca_res, label=TRUE, label.size = 3,
loadings = TRUE, loadings.label = TRUE, loadings.label.size = 3)
mono_poly_genes = read.csv('/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/novogene_071420/omim_enrichment/mono_poly_overlap_genelist.txt',
stringsAsFactor=F, header=F)$V1
length(mono_poly_genes)
# select egenes and get expression matrix
geneset = mono_poly_genes
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
# region_gene_df_dz[region_gene_df_dz<=5] = 0
# region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
region_gene_df_dz_log = region_gene_df_dz_log[rowSums(region_gene_df_dz_log)>.1,]
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
print('post expression filter per gene')
dim(region_gene_df_dz_log)
dz_annon = read.csv('/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/novogene_071420/D_mpraanalyze_barcode_allelic/egene_gtex/all_string_protein_annotations_w_dz.csv',
stringsAsFactor=F)
dz_annon = dz_annon%>%
dplyr::select(X.node, diseases)%>%
filter(X.node %in% rownames(region_gene_df_dz_log))%>%
column_to_rownames('X.node')
print('dz annon dimensions')
dim(dz_annon)
cell_annon= metadata_df%>%
dplyr::select(cluster_label, class_label)%>%
mutate(cluster_label = str_replace_all(cluster_label, "[ .-]", "."))%>%
distinct()%>%
filter(class_label!='')%>%
filter(cluster_label %in% colnames(region_gene_df_dz_log))%>%
column_to_rownames('cluster_label')
print('cell type annon dimensions')
dim(cell_annon)
p = pheatmap(region_gene_df_dz_log,
annotation_row=dz_annon,
annotation_col = cell_annon)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_mono_poly.pdf'),width=20,height=12)
par(bg='white')
plot(p$tree_col)
dev.off()
library(ggfortify)
library(cluster)
pca_res <- prcomp(region_gene_df_dz_log)
pca_res_df <- as.data.frame(pca_res$x)
# autoplot(pca_res, label=TRUE, label.size = 3)#,
# loadings = TRUE, loadings.label = TRUE, loadings.label.size = 3)
dz_annon
cell_annon
# get expressed cell types
write.table(data.frame(sort(rownames(region_gene_df_dz_log))),
'/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/novogene_071420/omim_enrichment/mono_poly_overlap_genelist_scrna_filt.txt',
quote=F, col.names=F, row.names=F)
1za
# select egenes from expr df interesting for
geneset = c('MTHFR', 'AKT1', 'C4A', 'CHRNA2')
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
region_gene_df_dz[region_gene_df_dz<=1] = 0
region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
p = pheatmap(region_gene_df_dz_log)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_omim.pdf'),width=20,height=15)
par(bg='white')
plot(p$tree_col)
dev.off()
# select egenes from expr df
region_gene_df_dz = brainmap_expr_df_region[,dz_genes$schizo[dz_genes$schizo%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
region_gene_df_dz[region_gene_df_dz<=5] = 0
region_gene_df_dz_log = log10(t(remove_lowvar_genes(region_gene_df_dz))+1)
region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
pheatmap(region_gene_df_dz_log_norm)
# make gmt object
region_gene_df_long_filt = region_gene_df%>%
rownames_to_column('region')%>%
gather(gene,tpm, -region)%>%
filter(tpm>5)%>%
inner_join(all_sym_to_entrez, by=c('gene'='SYMBOL'))
# region_gene_df_long_annon = region_gene_df_long_filt%>%
# column_to_rownames('gene')%>%
dim(region_gene_df_long_filt)
str(region_gene_df_long_filt)
region_gene_gmt = region_gene_df_long_filt%>%
mutate(region = as.factor(region))%>%
dplyr::select(region, ENTREZID)
str(region_gene_gmt)
# compareCluster(geneCluster = dz_gene_list_entrez, fun = "enrichGO",
# pAdjustMethod='BH',
# pvalueCutoff = 1,
# qvalueCutoff = 1,
# OrgDb='org.Hs.eg.db',
# ont = "BP",
# readable=TRUE)
dz_egene_scRNAregion = run_enrichment(entrez_list=dz_gene_list_entrez,
c_gmt = region_gene_gmt,
save_prefix=paste0(save_path, 'dz_egene_scRNAregion'))
No enrichment so just make an annotation in the big table
region_gene_df_long_filt%>%
group_by(gene)%>%tally()%>%arrange(desc(n))